library(CARBayesST)
library(plotly)
library(tidyverse)
# Directory holding the raw input data files, relative to the working directory.
DATA_PATH <- file.path('data')
# Temporal split: the model is trained on 2006-2017, 2018 is held out by
# masking its death rates (see masked_death_rate below), and 2019 is
# reserved for final testing.
TRAINING_YEARS <- 2006:2017
VALIDATION_YEAR <- 2018
TEST_YEAR <- 2019
# Two-digit state FIPS prefixes for US territories (American Samoa, Guam,
# Northern Mariana Islands, Puerto Rico, US Virgin Islands); these are
# excluded from the adjacency data below.
TERRITORY_FIPS <- c('60', '66', '69', '72', '78')
The main data set contains model-based drug overdose death rates by county, published by the NCHS.
# NCHS drug poisoning mortality by county. Zero-pad the FIPS code to five
# characters and keep only the columns used downstream.
overdose <- file.path(
  DATA_PATH,
  'NCHS_-_Drug_Poisoning_Mortality_by_County__United_States.csv'
) %>%
  read_csv() %>%
  transmute(
    year = Year,
    fips = str_pad(FIPS, width = 5, side = 'left', pad = '0'),
    county = County,
    death_rate = `Model-based Death Rate`
  )
The National Bureau of Economic Research provides a
CSV containing an edge-list representation of counties sharing a
border. We filter out FIPS codes demarcating US territories but
keep the District of Columbia. The edge list is then transformed into an
adjacency matrix, county_adj_matrix, which we use to
control for spatial autocorrelation in the model below.
# County adjacency edge list (NBER, 2010 vintage). Drop self-loops and any
# edge touching a US territory; DC is retained. Sorting gives a stable,
# reproducible ordering for the adjacency matrix built below.
county_adj <- read_csv(
  file.path(DATA_PATH, 'county_adjacency2010.csv'),
  col_select = c(fipscounty, fipsneighbor)
) %>%
  filter(
    fipsneighbor != fipscounty,
    # Exclude territories at BOTH endpoints of an edge: a neighbor code
    # missing from the matrix dimnames would make the character-indexed
    # assignment below fail with a subscript-out-of-bounds error.
    !str_sub(fipscounty, 1, 2) %in% TERRITORY_FIPS,
    !str_sub(fipsneighbor, 1, 2) %in% TERRITORY_FIPS
  ) %>%
  arrange(fipscounty, fipsneighbor)
# Counties in the model, in sorted order (county_adj is arranged above).
fips <- unique(county_adj$fipscounty)
# 0/1 adjacency matrix keyed by county FIPS on both dimensions.
# NOTE(review): symmetry relies on the edge list containing both directions
# of each border pair — confirm against the source file.
county_adj_matrix <- matrix(
  0L,
  nrow = length(fips),
  ncol = length(fips),
  dimnames = list(fips, fips)
)
# Set each (county, neighbor) cell listed in the edge list to 1.
county_adj_matrix[as.matrix(county_adj)] <- 1L
The industry concentration data are derived from the County Business Patterns Survey published by the US Census Bureau. In particular, we look at the so-called location quotient, which is the ratio of an industry's share of employment within a county to its national share. This statistic gives a measure of specialization within a county. Work with this feature is exploratory and subject to change. In particular, we suspect that working directly with the more interpretable share of employment within a county might give similar results; this sensitivity analysis will come in future work.
# Location-quotient columns used as candidate industry features.
industry_clusters <- c('Coal.Mining_Cluster.LQ')

# Indicator for a "high" location quotient: 1 when x is at or above the
# p-th quantile of the positive values of x, 0 otherwise; NA inputs stay NA.
#
# @param x Numeric vector of location quotients.
# @param p Quantile threshold among positive values (default 0.75).
# @return Numeric 0/1 vector the same length as x.
has_high_lq <- function(x, p = 0.75) {
  positive <- x[!is.na(x) & x > 0]
  # Guard: with no positive values there is no threshold to compare
  # against (the original errored inside quantile() in this case).
  if (length(positive) == 0) {
    return(rep(NA_real_, length(x)))
  }
  as.numeric(x >= quantile(positive, probs = p, na.rm = TRUE))
}
# County Business Patterns industry concentration (location quotients).
# Keep only the clusters of interest and add 0/1 high-LQ indicators.
industry <- read_csv(file.path(DATA_PATH, 'reduced_ind_conc_data.csv')) %>%
  # Use a character pad for consistency with the other FIPS columns
  # (the original passed numeric 0, which str_pad silently coerces).
  mutate(fips = str_pad(region, width = 5, side = 'left', pad = '0')) %>%
  select(year, fips, all_of(industry_clusters)) %>%
  mutate(across(all_of(industry_clusters), has_high_lq, .names = "high_{.col}"))
The data come from the Census SAIPE Model, which regresses the number of people in poverty on the number of tax returns with gross incomes falling below the official poverty threshold, the number of SNAP benefits in July of the previous year, the estimated resident population as of July 1, the total number of tax return exemptions, and the Census 2000 estimate of the number of people in poverty (within categories).
# Census SAIPE county-level poverty estimates: poverty rates and median
# household income, keyed by year and zero-padded FIPS code.
poverty <- arrow::read_parquet(file.path(DATA_PATH, 'Poverty_Data.parquet')) %>%
  select(
    year = Year,
    fips = `County FIPS Code`,
    state = `State FIPS`,
    pov_percent_all = `Poverty Percent, All Ages`,
    median_income = `Median Household Income`,
    pov_percent_minors = `Poverty Percent, Age 5-17 in Families`
  ) %>%
  mutate(
    year = as.numeric(year),
    fips = str_pad(fips, width = 5, side = 'left', pad = '0')
  )
# Assemble the modeling frame: one row per (year, county), restricted to
# the counties in the adjacency matrix so W and the data stay aligned.
df <- expand_grid(year = c(TRAINING_YEARS, VALIDATION_YEAR), fips) %>%
  left_join(overdose, by = c('year', 'fips')) %>%
  left_join(poverty, by = c('year', 'fips')) %>%
  left_join(industry, by = c('year', 'fips')) %>%
  mutate(
    state = str_sub(fips, 1, 2),
    # Mask the response for the validation year: case_when with no default
    # yields NA there, so the model treats those observations as missing.
    masked_death_rate = case_when(year != VALIDATION_YEAR ~ death_rate),
    # Impute missing industry indicators as 0
    high_Coal.Mining_Cluster.LQ = replace_na(high_Coal.Mining_Cluster.LQ, 0)
  ) %>%
  group_by(year, state) %>%
  mutate(
    # Impute missing values with yearly state-level summaries of the SAME
    # variable. (The original reused pov_percent_all in all three
    # imputations — in particular, median_income, measured in dollars,
    # was being filled with a poverty percentage.)
    pov_percent_all = replace_na(pov_percent_all,
                                 mean(pov_percent_all, na.rm = TRUE)),
    pov_percent_minors = replace_na(pov_percent_minors,
                                    mean(pov_percent_minors, na.rm = TRUE)),
    median_income = replace_na(median_income,
                               median(median_income, na.rm = TRUE))
  ) %>%
  ungroup() %>%
  arrange(year, fips)
head(df)
## # A tibble: 6 x 11
## year fips county death_rate state pov_percent_all median_income
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2006 01001 Autauga County, AL 6.60 01 12.5 46491
## 2 2006 01003 Baldwin County, AL 12.1 01 11 45493
## 3 2006 01005 Barbour County, AL 3.31 01 28.7 28692
## 4 2006 01007 Bibb County, AL 14.3 01 18.8 36794
## 5 2006 01009 Blount County, AL 10.6 01 12.8 41494
## 6 2006 01011 Bullock County, AL 4.76 01 32.9 23095
## # ... with 4 more variables: pov_percent_minors <dbl>,
## # Coal.Mining_Cluster.LQ <dbl>, high_Coal.Mining_Cluster.LQ <dbl>,
## # masked_death_rate <dbl>
We fit the generalized linear mixed model to the data:
\[\begin{align*} Y_{kt} &\sim f(y_{kt} \vert \mu_{kt}, \nu^2)\\ g(\mu_{kt}) &= \mathbf{x}'_{kt} \mathbf{\beta} + \psi_{kt}\\ \mathbf{\beta} &\sim N(\mathbf{\mu}_{\beta}, \mathbf{\Sigma}_{\beta})\\ \psi_{kt} &= \beta_1 + \phi_k + (\alpha + \delta_k) \frac{t - \bar{t}}{N}\\ \phi_k \vert \mathbf{\phi}_{-k}, \mathbf{W} &\sim N \left( \frac{\rho_{int}\sum_{j=1}^K w_{kj} \phi_j}{\rho_{int}\sum_{j=1}^K w_{kj} + 1 - \rho_{int}}, \frac{\tau_{int}^2}{\rho_{int}\sum_{j=1}^K w_{kj} + 1 - \rho_{int}} \right) \\ \delta_k \vert \mathbf{\delta}_{-k}, \mathbf{W} &\sim N \left( \frac{\rho_{slo}\sum_{j=1}^K w_{kj} \delta_j}{\rho_{slo}\sum_{j=1}^K w_{kj} + 1 - \rho_{slo}}, \frac{\tau_{slo}^2}{\rho_{slo}\sum_{j=1}^K w_{kj} + 1 - \rho_{slo}} \right) \\ \tau_{int}^2, \tau_{slo}^2 &\sim \text{Inverse-Gamma}(a,b) \\ \rho_{int}, \rho_{slo} &\sim \text{Uniform}(0, 1) \\ \alpha &\sim N(\mu_{\alpha}, \sigma^2_{\alpha}) \end{align*}\]
where \(w_{kj}\) denotes entry \((k, j)\) of the adjacency matrix \(\mathbf{W}\), \(t\) indexes years, and \(K\) is the number of counties.
This model is meant to provide us with a benchmark of run-time and direction on building a diagnostics framework.
# Fit the CARBayesST spatially-varying linear time-trend model. The response
# is the masked death rate, so validation-year observations are NA and are
# treated as missing during fitting. With n.sample = 10000, burnin = 5000,
# and thin = 2, the posterior retains 2500 draws (matching the n.sample
# column of the summary output below).
poverty_coal_model <- ST.CARlinear(
  masked_death_rate ~
    median_income +
    pov_percent_all +
    pov_percent_minors +
    high_Coal.Mining_Cluster.LQ,
  family = "gaussian", data = df, W = county_adj_matrix,
  burnin = 5000, n.sample = 10000, thin = 2
)
# Attach fitted values and residuals against the UNMASKED death rate.
# For validation-year rows the response was masked during fitting, so
# death_rate - fitted is an out-of-sample prediction error there.
df <- df %>%
  mutate(
    fitted = fitted(poverty_coal_model),
    residuals = death_rate - fitted
  )
poverty_coal_model$summary.results
## Median 2.5% 97.5% n.sample % accept
## (Intercept) 17.2751 16.5582 18.0004 2500 100.0
## median_income 0.0000 0.0000 0.0000 2500 100.0
## pov_percent_all -0.0633 -0.0881 -0.0398 2500 100.0
## pov_percent_minors -0.1367 -0.1534 -0.1184 2500 100.0
## high_Coal.Mining_Cluster.LQ 0.7525 -0.0584 1.5250 2500 100.0
## alpha 9.1997 9.0270 9.3746 2500 100.0
## tau2.int 100.5614 95.3633 106.1506 2500 100.0
## tau2.slo 84.8537 78.9604 90.7896 2500 100.0
## nu2 5.8817 5.7904 5.9760 2500 100.0
## rho.int 0.9904 0.9771 0.9977 2500 45.6
## rho.slo 0.9879 0.9719 0.9973 2500 45.0
## n.effective Geweke.diag
## (Intercept) 202.6 -1.5
## median_income 122.7 2.5
## pov_percent_all 551.8 -0.5
## pov_percent_minors 530.2 -1.0
## high_Coal.Mining_Cluster.LQ 615.1 -0.7
## alpha 190.1 -1.9
## tau2.int 846.7 2.4
## tau2.slo 1399.5 1.4
## nu2 2077.9 -0.5
## rho.int 954.6 0.0
## rho.slo 1100.3 0.2
As a sanity check, we see that our training residuals, averaged over time within counties, hover around 0.
# County boundary GeoJSON keyed by FIPS, from plotly's example datasets.
counties <- rjson::fromJSON(
  file = str_glue('https://raw.githubusercontent.com/',
                  'plotly/datasets/master/geojson-counties-fips.json'))
# Choropleth of training residuals averaged over time within each county.
plot_ly() %>%
  add_trace(
    data = df %>%
      filter(year %in% TRAINING_YEARS) %>%
      group_by(fips) %>%
      summarise(residuals = mean(residuals, na.rm = TRUE)),
    type = "choropleth",
    geojson = counties,
    locations = ~fips,
    z = ~residuals,
    zmin = -4,
    zmax = 4,
    colors = "PRGn",
    marker = list(
      line = list(width = 0)
    ),
    colorbar = list(
      title = "Residual",
      tickmode = "array",
      tickvals = -4:4,
      ticktext = c("Overprediction", '-3', '-2', '-1', '0', '1', '2', '3', 'Underprediction')
    )
  ) %>%
  layout(
    title = "Average (over time) Training Error",
    geo = list(
      scope = 'usa',
      # Fixed: 'albers usa' is the valid plotly projection name
      # (the original 'alvers usa' is not a recognized projection type).
      projection = list(type = 'albers usa')
    )
  )
Our prediction errors, on the other hand, still show some spatial clustering, with clumps of overprediction and underprediction.
# Choropleth of validation-year prediction errors by county. These rows had
# their response masked during fitting, so residuals here are out-of-sample.
plot_ly() %>%
  add_trace(
    data = df %>%
      filter(year == VALIDATION_YEAR),
    type = "choropleth",
    geojson = counties,
    locations = ~fips,
    z = ~residuals,
    zmin = -4,
    zmax = 4,
    colors = "PRGn",
    marker = list(
      line = list(width = 0)
    ),
    colorbar = list(
      title = "Residual",
      tickmode = "array",
      tickvals = -4:4,
      ticktext = c("Overprediction", '-3', '-2', '-1', '0', '1', '2', '3', 'Underprediction')
    )
  ) %>%
  layout(
    title = "Validation Errors",
    geo = list(
      scope = 'usa',
      # Fixed: 'albers usa' is the valid plotly projection name
      # (the original 'alvers usa' is not a recognized projection type).
      projection = list(type = 'albers usa')
    )
  )